library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.5 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(rstan)
## Loading required package: StanHeaders
## rstan (Version 2.21.3, GitRev: 2e1f913d3ca3)
## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)
##
## Attaching package: 'rstan'
## The following object is masked from 'package:tidyr':
##
## extract
library(bayesplot)
## This is bayesplot version 1.8.1
## - Online documentation and vignettes at mc-stan.org/bayesplot
## - bayesplot theme set to bayesplot::theme_default()
## * Does _not_ affect other ggplot2 plots
## * See ?bayesplot_theme_set for details on theme setting
#Using code Yue shared to include picture
knitr::include_graphics("/Users/brendaonyango/Desktop/IMG_4318 copy.png")
If the chain is mixing too slowly it will have only explored a limited range in the first several thousands iterations. It will overestimate the plausibility of pi values in this range and underestimate the plasuibility of values outside the range.
The chain having a high correlation will not estimate the correct peak, if there is one, in the posterior.
Chains that get stuck are overestimating in the values where they are stuck and will produce peaks in the posterior that are not there.
MCMC diagnostics are important because simulations are not perfect and diagnostics, used holistically, can identify how to improve a MCMC chain.
MCMC simulations are helpful because they are an efficient and flexible alternative to grid approximations. Grid approximations for more complex problems take a long time and grid approximations are not good for estimating posteriors when there are multiple dimensions. MCMCs can.
Rstan combines R with the Stan “engine”; Stan is written in C++ and is good for Bayesian modeling.
How to put Markov chains in applied language for social problems/questions.
We’re given Y|pi~Bin(n, pi) and pi~Beta(3, 8) and n = 10 with Y = 2.
#step 1: defining a grid with 5 pi values
grid_data <- data.frame(pi_grid = seq(from = 0, to = 1, length = 5))
# step 2: evaluating prior and liklihood at each pi
grid_data <- grid_data |>
mutate(prior = dbeta(pi_grid, 3,8),
liklihood =dbinom(2,10, pi_grid))
Next is approximating the posterior using the product of the liklihood and prior and then normalizing them.
#step 3: approximate the posterior
grid_data <- grid_data |>
mutate(unnormalized = liklihood * prior,
posterior = unnormalized/sum(unnormalized))
#confirming that posterior approximation sums to 1
grid_data |> summarize(sum(unnormalized), sum(posterior))
## sum(unnormalized) sum(posterior)
## 1 0.8765603 1
Next we’ll examine the grid approxiation posterior rounded to 2 decimal places.
#step 4
round(grid_data, 2) #2 is indicating how many decimal points to use
## pi_grid prior liklihood unnormalized posterior
## 1 0.00 0.00 0.00 0.00 0.00
## 2 0.25 3.00 0.28 0.85 0.96
## 3 0.50 0.70 0.04 0.03 0.04
## 4 0.75 0.01 0.00 0.00 0.00
## 5 1.00 0.00 0.00 0.00 0.00
Finally, we’ll plot this model.
# plotting the grid approximation posterior
ggplot(grid_data, aes(x = pi_grid, y = posterior)) +
geom_point() +
geom_segment(aes(x = pi_grid, xend = pi_grid, y = 0, yend = posterior))
#step 1
grd_data <- data.frame(p_grid = seq(from = 0, to = 1, length = 201)) #changed length to 201
#step 2
grd_data <- grd_data |> #changed names of data, prior, liklihood to keep distinct from part a
mutate(prir = dbeta(p_grid, 3,8), #prior remained the same
liklhood = dbinom(2,10, p_grid)) #successes and outcomes remained teh same
#step 3
grd_data <- grd_data |>
mutate(unnormalized = liklhood * prir,
posterir = unnormalized/sum(unnormalized))
#confirming that posterior approximation sums to 1
grd_data |> summarize(sum(unnormalized), sum(posterir))
## sum(unnormalized) sum(posterir)
## 1 41.79567 1
#step 4
round(grd_data, 2)
## p_grid prir liklhood unnormalized posterir
## 1 0.00 0.00 0.00 0.00 0.00
## 2 0.00 0.01 0.00 0.00 0.00
## 3 0.01 0.03 0.00 0.00 0.00
## 4 0.01 0.07 0.01 0.00 0.00
## 5 0.02 0.13 0.02 0.00 0.00
## 6 0.03 0.19 0.02 0.00 0.00
## 7 0.03 0.26 0.03 0.01 0.00
## 8 0.04 0.34 0.04 0.01 0.00
## 9 0.04 0.43 0.05 0.02 0.00
## 10 0.04 0.53 0.06 0.03 0.00
## 11 0.05 0.63 0.07 0.05 0.00
## 12 0.06 0.73 0.09 0.06 0.00
## 13 0.06 0.84 0.10 0.08 0.00
## 14 0.06 0.95 0.11 0.11 0.00
## 15 0.07 1.06 0.12 0.13 0.00
## 16 0.07 1.17 0.14 0.16 0.00
## 17 0.08 1.29 0.15 0.19 0.00
## 18 0.09 1.40 0.16 0.22 0.01
## 19 0.09 1.51 0.17 0.26 0.01
## 20 0.10 1.62 0.18 0.30 0.01
## 21 0.10 1.72 0.19 0.33 0.01
## 22 0.10 1.83 0.20 0.37 0.01
## 23 0.11 1.93 0.21 0.41 0.01
## 24 0.12 2.02 0.22 0.45 0.01
## 25 0.12 2.12 0.23 0.49 0.01
## 26 0.12 2.21 0.24 0.53 0.01
## 27 0.13 2.30 0.25 0.57 0.01
## 28 0.14 2.38 0.26 0.61 0.01
## 29 0.14 2.45 0.26 0.65 0.02
## 30 0.14 2.53 0.27 0.68 0.02
## 31 0.15 2.60 0.28 0.72 0.02
## 32 0.16 2.66 0.28 0.75 0.02
## 33 0.16 2.72 0.29 0.78 0.02
## 34 0.16 2.77 0.29 0.80 0.02
## 35 0.17 2.82 0.29 0.83 0.02
## 36 0.18 2.87 0.30 0.85 0.02
## 37 0.18 2.91 0.30 0.87 0.02
## 38 0.18 2.94 0.30 0.88 0.02
## 39 0.19 2.97 0.30 0.89 0.02
## 40 0.20 3.00 0.30 0.90 0.02
## 41 0.20 3.02 0.30 0.91 0.02
## 42 0.21 3.04 0.30 0.92 0.02
## 43 0.21 3.05 0.30 0.92 0.02
## 44 0.22 3.06 0.30 0.92 0.02
## 45 0.22 3.06 0.30 0.91 0.02
## 46 0.22 3.06 0.30 0.91 0.02
## 47 0.23 3.06 0.29 0.90 0.02
## 48 0.24 3.05 0.29 0.89 0.02
## 49 0.24 3.04 0.29 0.88 0.02
## 50 0.24 3.02 0.29 0.86 0.02
## 51 0.25 3.00 0.28 0.85 0.02
## 52 0.26 2.98 0.28 0.83 0.02
## 53 0.26 2.96 0.27 0.81 0.02
## 54 0.26 2.93 0.27 0.79 0.02
## 55 0.27 2.90 0.26 0.77 0.02
## 56 0.28 2.87 0.26 0.74 0.02
## 57 0.28 2.83 0.25 0.72 0.02
## 58 0.29 2.79 0.25 0.70 0.02
## 59 0.29 2.75 0.24 0.67 0.02
## 60 0.30 2.71 0.24 0.65 0.02
## 61 0.30 2.67 0.23 0.62 0.01
## 62 0.30 2.62 0.23 0.60 0.01
## 63 0.31 2.58 0.22 0.57 0.01
## 64 0.32 2.53 0.22 0.55 0.01
## 65 0.32 2.48 0.21 0.52 0.01
## 66 0.32 2.43 0.20 0.50 0.01
## 67 0.33 2.38 0.20 0.47 0.01
## 68 0.34 2.32 0.19 0.45 0.01
## 69 0.34 2.27 0.19 0.43 0.01
## 70 0.35 2.22 0.18 0.40 0.01
## 71 0.35 2.16 0.18 0.38 0.01
## 72 0.36 2.11 0.17 0.36 0.01
## 73 0.36 2.05 0.16 0.34 0.01
## 74 0.36 2.00 0.16 0.32 0.01
## 75 0.37 1.94 0.15 0.30 0.01
## 76 0.38 1.89 0.15 0.28 0.01
## 77 0.38 1.83 0.14 0.26 0.01
## 78 0.38 1.78 0.14 0.24 0.01
## 79 0.39 1.72 0.13 0.23 0.01
## 80 0.40 1.67 0.13 0.21 0.01
## 81 0.40 1.61 0.12 0.19 0.00
## 82 0.41 1.56 0.12 0.18 0.00
## 83 0.41 1.51 0.11 0.17 0.00
## 84 0.42 1.45 0.11 0.15 0.00
## 85 0.42 1.40 0.10 0.14 0.00
## 86 0.42 1.35 0.10 0.13 0.00
## 87 0.43 1.30 0.09 0.12 0.00
## 88 0.44 1.25 0.09 0.11 0.00
## 89 0.44 1.20 0.08 0.10 0.00
## 90 0.44 1.16 0.08 0.09 0.00
## 91 0.45 1.11 0.08 0.08 0.00
## 92 0.46 1.06 0.07 0.08 0.00
## 93 0.46 1.02 0.07 0.07 0.00
## 94 0.47 0.98 0.07 0.06 0.00
## 95 0.47 0.93 0.06 0.06 0.00
## 96 0.48 0.89 0.06 0.05 0.00
## 97 0.48 0.85 0.06 0.05 0.00
## 98 0.48 0.81 0.05 0.04 0.00
## 99 0.49 0.78 0.05 0.04 0.00
## 100 0.50 0.74 0.05 0.03 0.00
## 101 0.50 0.70 0.04 0.03 0.00
## 102 0.50 0.67 0.04 0.03 0.00
## 103 0.51 0.64 0.04 0.02 0.00
## 104 0.52 0.60 0.04 0.02 0.00
## 105 0.52 0.57 0.03 0.02 0.00
## 106 0.52 0.54 0.03 0.02 0.00
## 107 0.53 0.51 0.03 0.02 0.00
## 108 0.54 0.48 0.03 0.01 0.00
## 109 0.54 0.46 0.03 0.01 0.00
## 110 0.54 0.43 0.02 0.01 0.00
## 111 0.55 0.41 0.02 0.01 0.00
## 112 0.56 0.38 0.02 0.01 0.00
## 113 0.56 0.36 0.02 0.01 0.00
## 114 0.57 0.34 0.02 0.01 0.00
## 115 0.57 0.32 0.02 0.01 0.00
## 116 0.58 0.30 0.02 0.00 0.00
## 117 0.58 0.28 0.01 0.00 0.00
## 118 0.58 0.26 0.01 0.00 0.00
## 119 0.59 0.24 0.01 0.00 0.00
## 120 0.60 0.23 0.01 0.00 0.00
## 121 0.60 0.21 0.01 0.00 0.00
## 122 0.60 0.20 0.01 0.00 0.00
## 123 0.61 0.18 0.01 0.00 0.00
## 124 0.62 0.17 0.01 0.00 0.00
## 125 0.62 0.16 0.01 0.00 0.00
## 126 0.62 0.15 0.01 0.00 0.00
## 127 0.63 0.14 0.01 0.00 0.00
## 128 0.64 0.13 0.01 0.00 0.00
## 129 0.64 0.12 0.01 0.00 0.00
## 130 0.64 0.11 0.00 0.00 0.00
## 131 0.65 0.10 0.00 0.00 0.00
## 132 0.66 0.09 0.00 0.00 0.00
## 133 0.66 0.08 0.00 0.00 0.00
## 134 0.66 0.08 0.00 0.00 0.00
## 135 0.67 0.07 0.00 0.00 0.00
## 136 0.68 0.06 0.00 0.00 0.00
## 137 0.68 0.06 0.00 0.00 0.00
## 138 0.69 0.05 0.00 0.00 0.00
## 139 0.69 0.05 0.00 0.00 0.00
## 140 0.70 0.04 0.00 0.00 0.00
## 141 0.70 0.04 0.00 0.00 0.00
## 142 0.70 0.03 0.00 0.00 0.00
## 143 0.71 0.03 0.00 0.00 0.00
## 144 0.72 0.03 0.00 0.00 0.00
## 145 0.72 0.03 0.00 0.00 0.00
## 146 0.72 0.02 0.00 0.00 0.00
## 147 0.73 0.02 0.00 0.00 0.00
## 148 0.74 0.02 0.00 0.00 0.00
## 149 0.74 0.02 0.00 0.00 0.00
## 150 0.74 0.01 0.00 0.00 0.00
## 151 0.75 0.01 0.00 0.00 0.00
## 152 0.76 0.01 0.00 0.00 0.00
## 153 0.76 0.01 0.00 0.00 0.00
## 154 0.76 0.01 0.00 0.00 0.00
## 155 0.77 0.01 0.00 0.00 0.00
## 156 0.78 0.01 0.00 0.00 0.00
## 157 0.78 0.01 0.00 0.00 0.00
## 158 0.78 0.00 0.00 0.00 0.00
## 159 0.79 0.00 0.00 0.00 0.00
## 160 0.80 0.00 0.00 0.00 0.00
## 161 0.80 0.00 0.00 0.00 0.00
## 162 0.80 0.00 0.00 0.00 0.00
## 163 0.81 0.00 0.00 0.00 0.00
## 164 0.82 0.00 0.00 0.00 0.00
## 165 0.82 0.00 0.00 0.00 0.00
## 166 0.83 0.00 0.00 0.00 0.00
## 167 0.83 0.00 0.00 0.00 0.00
## 168 0.84 0.00 0.00 0.00 0.00
## 169 0.84 0.00 0.00 0.00 0.00
## 170 0.84 0.00 0.00 0.00 0.00
## 171 0.85 0.00 0.00 0.00 0.00
## 172 0.86 0.00 0.00 0.00 0.00
## 173 0.86 0.00 0.00 0.00 0.00
## 174 0.86 0.00 0.00 0.00 0.00
## 175 0.87 0.00 0.00 0.00 0.00
## 176 0.88 0.00 0.00 0.00 0.00
## 177 0.88 0.00 0.00 0.00 0.00
## 178 0.88 0.00 0.00 0.00 0.00
## 179 0.89 0.00 0.00 0.00 0.00
## 180 0.90 0.00 0.00 0.00 0.00
## 181 0.90 0.00 0.00 0.00 0.00
## 182 0.90 0.00 0.00 0.00 0.00
## 183 0.91 0.00 0.00 0.00 0.00
## 184 0.92 0.00 0.00 0.00 0.00
## 185 0.92 0.00 0.00 0.00 0.00
## 186 0.92 0.00 0.00 0.00 0.00
## 187 0.93 0.00 0.00 0.00 0.00
## 188 0.94 0.00 0.00 0.00 0.00
## 189 0.94 0.00 0.00 0.00 0.00
## 190 0.95 0.00 0.00 0.00 0.00
## 191 0.95 0.00 0.00 0.00 0.00
## 192 0.96 0.00 0.00 0.00 0.00
## 193 0.96 0.00 0.00 0.00 0.00
## 194 0.96 0.00 0.00 0.00 0.00
## 195 0.97 0.00 0.00 0.00 0.00
## 196 0.98 0.00 0.00 0.00 0.00
## 197 0.98 0.00 0.00 0.00 0.00
## 198 0.98 0.00 0.00 0.00 0.00
## 199 0.99 0.00 0.00 0.00 0.00
## 200 1.00 0.00 0.00 0.00 0.00
## 201 1.00 0.00 0.00 0.00 0.00
Next I’ll plot this approximation.
ggplot(grd_data, aes(x = p_grid, y = posterir)) +
geom_point() +
geom_segment(aes(x = p_grid, xend = p_grid, y = 0, yend = posterir))
We can see that b gives a much better estimate than a.
We’re given a Gamma-Poison model for Y|lambda~Pois(lambda) and lambda~Gamma(20,5). N = 3 and (Y1, Y2, Y3) = (0, 1, 0).
# Step 1: Define a grid of 10 lambda values
grid_data <- data.frame(lambda_grid = seq(from = 0, to = 8, length = 8))
# Step 2: Evaluate the prior & likelihood at each lambda
grid_data <- grid_data |>
mutate(prior = dgamma(lambda_grid, 20, 5), #given prior
likelihood = dpois(0, lambda_grid) * dpois(1, lambda_grid) * dpois(0, lambda_grid))
# Step 3: Approximate the posterior
grid_data <- grid_data |>
mutate(unnormalized = likelihood * prior,
posterior = unnormalized / sum(unnormalized))
# Set the seed
set.seed(1500)
# Step 4: sample from the discretized posterior
post_sample <- sample_n(grid_data, size = 10000,
weight = posterior, replace = TRUE)
#plotting histogram of the above
ggplot(post_sample, aes(x = lambda_grid)) +
geom_histogram(aes(y = ..density..), color = "white") +
stat_function(fun = dgamma, args = list(20, 5)) +
lims(x = c(0, 8))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
# Step 1: Define a grid of 10 lambda values
grid_data <- data.frame(lambda_grid = seq(from = 0, to = 8, length = 201)) # went up to 10 values to see what happens after 8
# Step 2: Evaluate the prior & likelihood at each lambda
grid_data <- grid_data |>
mutate(prior = dgamma(lambda_grid, 20, 5), #given prior
likelihood = dpois(0, lambda_grid) * dpois(1, lambda_grid) * dpois(0, lambda_grid))
# Step 3: Approximate the posterior
grid_data <- grid_data |>
mutate(unnormalized = likelihood * prior,
posterior = unnormalized / sum(unnormalized))
# Set the seed
set.seed(1500)
# Step 4: sample from the discretized posterior
post_sample <- sample_n(grid_data, size = 10000,
weight = posterior, replace = TRUE)
#plotting histogram of the above
ggplot(post_sample, aes(x = lambda_grid)) +
geom_histogram(aes(y = ..density..), color = "white") +
stat_function(fun = dgamma, args = list(20, 5)) +
lims(x = c(0, 8))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).